/*
 * Full-width squaring of an 8-limb integer: y = x * x, with no reduction.
 *
 * x : 8 limbs of 32 bits each, least significant limb first.
 * y : receives all 16 limbs of the square, least significant limb first.
 *
 * Every off-diagonal product x[i]*x[j] (i < j) appears twice in the square,
 * so those products are summed once per output column and doubled while the
 * diagonal squares x[i]*x[i] are folded in and carries are propagated.
 *
 * Note: x[i] is read again in the final pass after y[0 .. 2*i-1] have been
 * stored, so calling this with y pointing at the same storage as x does not
 * yield the square of the original value.
 */
inline void sqrPre(uint32_t y[16], const uint32_t x[8])
{
	enum { LIMBS = 8 };
	const uint64_t LOW32 = 0xffffffff;

	/*
	 * cross[p] collects, for output column p, the 32-bit halves of all
	 * products x[i]*x[j] with i < j: the low half lands in column i+j, the
	 * high half in column i+j+1.  Columns 0 and 15 never receive anything.
	 * At most seven 32-bit values meet in one column, so a 64-bit slot
	 * cannot overflow, even after doubling.
	 */
	uint64_t cross[2 * LIMBS] = { 0 };

	for (int lo = 0; lo < LIMBS - 1; lo++) {
		for (int hi = lo + 1; hi < LIMBS; hi++) {
			const uint64_t prod = (uint64_t)x[lo] * x[hi];
			cross[lo + hi] += prod & LOW32;
			cross[lo + hi + 1] += prod >> 32;
		}
	}

	/*
	 * Walk the columns from least to most significant.  The square of
	 * x[i] supplies its low half to column 2*i and its high half to
	 * column 2*i+1; bits 32 and up of acc are the carry into the next
	 * column.
	 */
	uint64_t acc = 0;
	for (int i = 0; i < LIMBS; i++) {
		const uint64_t sq = (uint64_t)x[i] * x[i];

		acc = (acc >> 32) + (sq & LOW32) + 2 * cross[2 * i];
		y[2 * i] = (uint32_t)acc;

		acc = (acc >> 32) + (sq >> 32) + 2 * cross[2 * i + 1];
		y[2 * i + 1] = (uint32_t)acc;
	}
}
